/*
This is the main program that controls the matching of officer information across datasets. It proceeds in 5 steps
1) Read in CDP personnel profile information. This is the most complete and detailed dataset of
officers and their attributes and will serve as the backbone of the matching program
2) Perform a fuzzy match between the profile data and each of our 6 other datasets that contain information
about police officers
3) Combine the UNmatched records from the 6 other datasets with the profile data and deduplicate
Using this as our main matching set moving forward will permit matching officers who appear in
multiple non-profile datasets, though this is rare
4) Match each of the 6 datasets to the expanded profile data again
5) Save versions of each dataset with a master ID (mid) that allows quick matching
between them
*/

*07/05/2021
*File to attach ids to FOIA files in CPD. First start with profiles FOIA data and combine this with a list of unmatched individuals in other FOIA data. Create a masterid of officers and attach these ids to the FOIA data

**Housekeeping
local ver_suff _9_21
clear all
cap log close
if "`c(username)'"=="taehokim" |"`c(username)'"=="Taeho Kim"{ 
	cd "/Users/`c(username)'/Dropbox (Penn Law)/misconduct/do_files`ver_suff'"
} 
else{
	cd "/Users/`c(username)'/Dropbox/misconduct/do_files`ver_suff'"
}
log using ../logs`ver_suff'/FOIA_clean_master.log, text replace
set more off


 
**
*******************
*BRING IN PROFILES DATA
*******************
**Read raw profiles data
insheet using ../FOIA/FOIA_profiles_051821/17356-P661039-All-Sworn-YTD_Current-Asgn-Positionexport.csv

 **Record name
gen fname = trim(firstname)
gen lname = trim(LASTNAME)
gen mi = trim(middleinitial)

**Record appontment date
gen appoint_dt = date(appointeddate, "DMY", 2025)
gen appoint_yr = year(appoint_dt)
gen appoint_m = month(appoint_dt)
gen appoint_d = day(appoint_dt)

**Some obs have variations of last names: deflate last names
gen lname_mod = subinstr(lname, "-", "", .)
replace lname_mod = subinstr(lname_mod, " ", "", .)
replace lname_mod = subinstr(lname_mod, ".", "", .)
 
**Drop to unique records by name, appointment date, and birthyear
duplicates drop fname lname_mod mi appoint_yr appoint_m appoint_d birthyear, force

sort fname lname_mod mi appoint_yr appoint_m appoint_d birthyear 
 
**Record soundex of name
gen fname_sdx = soundex(fname)
gen lname_sdx = soundex(lname)

**Make a copy of every variable for self-matching later
foreach var in fname lname_mod fname_sdx lname_sdx mi appoint_yr appoint_m appoint_d birthyear{
	gen u_`var' = `var'
}
gen u_pid = _n

**Create a variable to record where the data came from for matching later
gen source = "profiles"

**Save intermediate profiles data
save ../processed_data`ver_suff'/FOIA_processing/profiles_pid, replace

*******************
*SAVE UNMATCHED INDIVIDUALS IN EACH DATASETS
*******************  

//pass along the profiles data so that I can match each dataset with this profiles data

**Loop over all data sources
foreach file in complaints arrests salary unithistory awards trr AA{
	
	**Run the individual cleaning/matching program for this dataset
	do ../do_files`ver_suff'/FOIA_clean_`file' ../processed_data`ver_suff'/FOIA_processing/profiles_pid
	
	**Keep only records that were not matched
	keep if _merge == 1
	
	**Restrict to match-relevant variables for each set
	if "`file'" == "complaints" {
		keep birthyear fname lname lname_mod mi appoint_yr appoint_m appoint_d sex race  
		
	}
	if "`file'" == "arrests" {
		
		drop if fname == "REDACTED"  
		keep  fname lname mi appoint_yr appoint_m appoint_d 
	}
	if "`file'" == "salary" {
		keep birthyear fname lname lname_mod mi appoint_yr appoint_m appoint_d sex	
	}
	if "`file'" == "unithistory" {
		keep birthyear fname lname lname_mod mi appoint_yr appoint_m appoint_d sex race 
	}
	if "`file'" == "awards" {
		keep birthyear fname lname lname_mod mi appoint_yr appoint_m appoint_d sex race resignationdate 
	}
	if "`file'" == "trr" {
		keep birthyear fname lname lname_mod mi appoint_yr appoint_m appoint_d  
	}
	if "`file'" == "AA" {
		keep birthyear fname lname lname_mod mi appoint_yr appoint_m appoint_d sex race 
	}
	
	gen source = "`file'"
	
	**Save dataset of unmatched records
	save ../processed_data`ver_suff'/FOIA_processing/unmatched_`file'.dta, replace
	
}


*******************
*BUNDLE ALL UNMATCHED OFFICERS IN ALL FOIA DATA AND DEDUPLICATE
*******************   
 
use ../processed_data`ver_suff'/FOIA_processing/unmatched_complaints.dta, clear

foreach file in complaints arrests salary unithistory awards trr AA{
	append using ../processed_data`ver_suff'/FOIA_processing/unmatched_`file'.dta
}

gsort  lname_mod fname appoint_yr appoint_m appoint_d birthyear -sex -race //keep the most complete profiles

by  lname_mod fname appoint_yr appoint_m appoint_d birthyear: gen ct = _n
keep if ct == 1
drop ct

 
  

//some obs that have missing info are actually copies of other officers. Drop these
// drop if missing birthyear but other info is duplicated
gsort lname_mod fname appoint_yr appoint_m appoint_d birthyear -sex -race
by  lname_mod fname appoint_yr appoint_m appoint_d: gen ct = _n
drop if ct != 1 & birthyear == .
drop ct

// drop if missing appoint year but other info is duplicated
gsort  lname_mod fname birthyear appoint_yr appoint_m appoint_d  -sex -race
by  lname_mod fname birthyear: gen ct = _n
drop if ct != 1 & appoint_yr == .
drop ct

 

//drop if missing appoint dates but names are duplicated
gsort  lname_mod fname appoint_yr appoint_m appoint_d birthyear -sex -race
by  lname_mod fname: gen ct = _n
drop if ct != 1 & (appoint_yr ==. & appoint_m==. & appoint_d==.) 
drop ct

 
**Resignation date only exists in awards data
replace resignationdate = "NA" if source != "awards"

//clean/unify race variable
replace race = "WHITE" if inlist(race, "WHI", "WWH", "White")
replace race = "BLACK" if inlist(race, "BLK", "Black or African American")
replace race = "AMER IND/ALASKAN NATIVE" if inlist(race, "AMER INDIAN / ALASKAN NATIVE", "American Indian or Alaska Native")
replace race = "ASIAN/PACIFIC ISLANDER" if inlist(race, "API", "ASIAN / PACIFIC ISLANDER", "Asian")
replace race = "UNKNOWN" if inlist(race, "I", "NULL", "Not listed", "S", "U", "UNKNOWN / REFUSED", "Unknown")
replace race = "HISPANIC" if inlist(race, "Hispanic, Latino, or Spanish origin", "WHITE HISPANIC", "BLACK HISPANIC")
replace race = "ASIAN/PACIFIC ISLANDER" if strpos(race, "Hawaiian") 

**Drop if any missing or unusual name information
drop if fname == "" 
drop if lname < "A" 

 **Unify sex variable
replace sex = "F" if sex == "Female"
replace sex = "M" if sex == "Male" 


 
*******************
*COMBINE UNMATCHED OFFICERS WITH ORIGINAL PROFILES DATA AND DEDUPLICATE BY FUZZY MATCHING
*******************   

**Append unmatched records from other datasets with profiles data
append using ../processed_data`ver_suff'/FOIA_processing/profiles_pid 

keep race fname lname  lname_mod mi appoint_yr appoint_m appoint_d birthyear sex source resignationdate status employeepositioncd description unitassigned unitassigneddescription star1 star2 star3 star4 star5 star6 star7 star8 star9 star10

sort  lname_mod fname mi appoint_yr appoint_m appoint_d birthyear

gen pid = _n

tempfile profiles 
save `profiles'
 
gen fname_sdx = soundex(fname)
gen lname_sdx = soundex(lname)

//make identical copy of profiles as a matching dataset to identify duplicates

preserve

foreach var in fname  lname_mod fname_sdx lname_sdx mi appoint_yr appoint_m appoint_d birthyear{
	gen u_`var' = `var'
}

 
gen u_pid_own = _n

tempfile profiles_temp
save `profiles_temp'
 

restore

**Match unmatched records + profiles to themselves
do ../do_files`ver_suff'/FOIA_clean_own `profiles_temp'


//Identify duplicate observations

**Load the profiles data matched to itself
use ../processed_data`ver_suff'/FOIA_processing/pid_appending, clear
 
**Find pairs of PIDs that were matched to each other by creating groups baed
**on the high and low pids on both sides of the match. If pid 1 matched to pid 2
**then both the record with pid=1 and u_pid=2 an pid=2 and u_pid=1 will have low=1 and high=2
egen low = rowmin(pid u_pid_own)
egen high = rowmax(pid u_pid_own)

**Restrict only to pairs (this drops one observation)
sort low high
by low high: gen ct = _N 
keep if ct == 2

**Bring back the profiles data, but drop any records that match to 
**a duplicate identified here. Specifically, the one with the higher pid,
**which indicates that it is an observation from non-profiles data and
**so has fewer variables
keep if pid == high
merge m:1 pid using `profiles'

drop if _merge == 3
drop _merge u_pid_own match_no low high ct



*******************
*ASSIGN MASTERIDS TO THE NEW PROFILES DATA
*******************   
 
**Create a new master ID (mid)
sort fname  lname_mod mi appoint_yr appoint_m appoint_d birthyear
gen mid = _n

drop pid 

**Save a finalized version of the profiles (+ unmatched records from other datasets)
save ../processed_data`ver_suff'/FOIA_unified/profiles.dta, replace

**Restore pid=mid for second matching pass
gen pid = _n




*******************
*ATTEMPT MATCHING MASTER PROFILES DATA WITH ALL FOIA DATA
*******************   


//set up profiles data to use for the fuzzy matching function
preserve

gen fname_sdx = soundex(fname)
gen lname_sdx = soundex(lname)

foreach var in fname  lname_mod fname_sdx lname_sdx mi appoint_yr appoint_m appoint_d birthyear{
	gen u_`var' = `var'
}

 
gen u_pid = _n

save ../processed_data`ver_suff'/FOIA_processing/profiles_mid_temp, replace

restore




//Match the new expanded profiles dataset to each input dataset

qui foreach file in complaints arrests salary unithistory awards trr AA{
	do ../do_files`ver_suff'/FOIA_clean_`file' ../processed_data`ver_suff'/FOIA_processing/profiles_mid_temp
	
	*This time focus on the records that did match, which should be all of them
	keep if _merge == 3
	
	noisily display("`file'")
	noisily list if fname=="AARON" & lname=="CUNNINGHAM" & appoint_yr==1998 & appoint_m==6 & appoint_d==29
	
	if "`file'" == "complaints" {
		keep birthyear fname lname  lname_mod mi appoint_yr appoint_m appoint_d sex race u_pid 
 
	}
	if "`file'" == "arrests" {
		
		drop if fname == "REDACTED"  
		keep  fname lname  lname_mod mi appoint_yr appoint_m appoint_d  u_pid
		duplicates drop fname  lname_mod appoint_yr appoint_m appoint_d, force
	}
	if "`file'" == "salary" {
		keep birthyear fname  lname_mod lname mi appoint_yr appoint_m appoint_d sex u_pid	
 
	}
	if "`file'" == "unithistory" {
		keep birthyear fname  lname_mod lname mi appoint_yr appoint_m appoint_d sex race u_pid 
 
	}
	if "`file'" == "awards" {
		keep birthyear fname  lname_mod lname mi appoint_yr appoint_m appoint_d sex race resignationdate u_pid 
 
	}
	if "`file'" == "trr" {
		keep birthyear fname  lname_mod lname mi appoint_yr appoint_m appoint_d u_pid  
	
	}
	if "`file'" == "AA" {
		keep birthyear fname lname lname_mod mi appoint_yr appoint_m appoint_d sex race u_pid
	}
	capture duplicates drop fname  lname_mod appoint_yr appoint_m appoint_d birthyear, force
			
	tempfile matched
	save `matched'	

	
	//merge with original FOIA data using matching relevant information (which the matched dataset is unique in. A
	use ../processed_data`ver_suff'/FOIA_processing/`file', clear
	
	**rrests data doesn't have birthyear so needs a special treatment in merging
	
	if "`file'" != "arrests"{
		merge m:1 fname  lname_mod appoint_yr appoint_m appoint_d birthyear using `matched'
	}
	
	else if "`file'" == "arrests"{
	merge m:1 fname  lname_mod appoint_yr appoint_m appoint_d  using `matched'
	}
	
	**Rename the IDs assigned from the expanded profile data to user IDs
	rename u_pid mid
	drop _merge
	save ../processed_data`ver_suff'/FOIA_unified/`file'.dta, replace
	
}


clear
cap log close
